import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from scipy import stats
from sklearn.tree import DecisionTreeClassifier
#from sklearn.feature_extraction.text import CountVectorizer #DT does not take strings as input for the model fit step....
from IPython.display import Image
from sklearn import tree
from os import system
# Load the bank term-deposit marketing dataset and take a first look.
myDF=pd.read_csv('bank-full.csv')
# First/last rows, dimensions, numeric summary, and per-column cardinality.
myDF.head(10)
myDF.tail(10)
myDF.shape
myDF.describe().transpose()
myDF.nunique()
def missing_check(df):
    """Summarise missing values per column of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    pd.DataFrame
        Indexed by column name with two columns: 'Total' (count of nulls)
        and 'Percent' (fraction of rows that are null), sorted so the
        most-missing columns come first.
    """
    # Count of null cells per column, most-missing first.
    total = df.isnull().sum().sort_values(ascending=False)
    # Fraction of rows that are null; isnull().count() equals the row count.
    percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    # Put the two series side by side under named columns.
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data
# Confirm there are no missing values, then inspect dtypes.
missing_check(myDF)
myDF.info()
# Convert every 'object'-dtype column into a pandas Categorical.
# (Indentation restored: the notebook export had flattened this loop.)
for i in myDF.columns:
    if myDF[i].dtype == 'object':
        myDF[i] = pd.Categorical(myDF[i])
myDF.info()
# Distribution plots for the continuous (int64) columns.
# sns.distplot was deprecated and later removed from seaborn; histplot with
# a KDE overlay is the supported equivalent.
for i in myDF.columns:
    if myDF[i].dtype == 'int64':
        sns.histplot(myDF[i], kde=True)
        plt.show()
# Box plot of age to eyeball outliers.
sns.boxplot(data=myDF,x=myDF['age']);
# Quartiles and interquartile range for age.
Q1=myDF['age'].quantile(0.25)
Q3=myDF['age'].quantile(0.75)
IQR=Q3-Q1
print(Q1)
print(Q3)
print(IQR)
# Tukey fences: 1.5 * IQR beyond the quartiles.
Lower_Whisker_age = Q1-(1.5 * IQR)
Upper_Whisker_age = Q3 + (1.5*IQR)
print(Lower_Whisker_age, Upper_Whisker_age)
#Getting the number of records above the upper whisker age of 70.5 and with job =unknown. These can be changed to "retired"
#='retired'
myDF.loc[(myDF['age']>Upper_Whisker_age) & (myDF['job']=='unknown'),'job']='retired'
# Distribution of pdays; the conversion below treats values > 0 as "contacted before".
myDF['pdays'].value_counts(normalize=True)
# Insight: most pdays values are -1 and the remaining values each occur with
# extremely low frequency, so pdays is converted to a boolean-style category
# 'contacted_before' below.
# New categorical flag: was the client contacted before (pdays > 0)?
myDF['contacted_before']=np.where(myDF['pdays']>0,'Yes','No')
myDF.head()
myDF['contacted_before']=pd.Categorical(myDF['contacted_before'])
myDF['contacted_before'].value_counts(normalize=True)
#Dropping the pdays column since the data is represented in the contacted_before column
myDF.drop(['pdays'],axis=1,inplace=True)
# Distribution of 'previous' — presumably the number of contacts made before
# this campaign; confirm against the data dictionary.
myDF['previous'].value_counts(normalize=True)
# Inspect the extreme outlier row where previous == 275.
myDF[myDF['previous']==275]
# NOTE(review): this drops by POSITION (myDF.index[29182]), not by label —
# it assumes the outlier sits at that position; verify it matches the row above.
myDF.drop(myDF.index[29182], inplace=True)
myDF[myDF['previous']==275]
#changing Previous to categorical previous_contact Yes or no.
myDF['previous_contact']=np.where(myDF['previous']>0,'Yes','No')
myDF.head()
myDF['previous_contact']=pd.Categorical(myDF['previous_contact'])
myDF['previous_contact'].value_counts(normalize=True)
#Dropping the 'previous' column since the data is represented in the previous_contact column
myDF.drop(['previous'],axis=1,inplace=True)
# Distribution of the campaign column (contact counts for the current campaign).
myDF['campaign'].value_counts(normalize=True)
# Box plot and Tukey whiskers for campaign.
sns.boxplot(data=myDF,x=myDF['campaign']);
Q1=myDF['campaign'].quantile(0.25)
Q3=myDF['campaign'].quantile(0.75)
IQR=Q3-Q1
print(Q1)
print(Q3)
print(IQR)
Lower_Whisker = Q1-(1.5 * IQR)
Upper_Whisker = Q3 + (1.5*IQR)
print(Lower_Whisker, Upper_Whisker)
# Alternative considered but not used: drop rows above the upper whisker.
#myDF=myDF[myDF['campaign']<Upper_Whisker]
# Instead of a numerical feature, bucket the campaign call counts into three
# indicator columns: 3 or fewer calls, 4-6 calls, and more than 6 calls.
# BUG FIX: the original bucketed on myDF['day'] (a different column) for the
# last two indicators, and used > 7 which also skipped the value 7. All three
# indicators now partition myDF['campaign'] exactly (<4, 4-6, >6).
myDF['campaign_3_orless'] = np.where(myDF['campaign'] < 4, 1, 0)
myDF['campaign_3_to_6'] = np.where((myDF['campaign'] > 3) & (myDF['campaign'] < 7), 1, 0)
myDF['campaign_morethan6'] = np.where(myDF['campaign'] > 6, 1, 0)
# The raw count is now fully represented by the three indicators.
myDF.drop(['campaign'], axis=1, inplace=True)
# Reviewing 'day' (day of month of the last contact) in detail.
# seaborn renamed pairplot's `size` parameter to `height` (old name removed
# in modern seaborn), so use the supported keyword.
sns.pairplot(myDF[["day", "Target"]], hue="Target", height=5)
# Day doesn't make sense as a continuous variable; bucket it into thirds of
# the month: 1-10, 11-20, 21-31.
myDF['day_w_1_10'] = np.where(myDF['day'] < 11, 1, 0)
myDF['day_w_11_20'] = np.where((myDF['day'] > 10) & (myDF['day'] < 21), 1, 0)
myDF['day_w_21_31'] = np.where((myDF['day'] > 20) & (myDF['day'] < 32), 1, 0)
# The raw day value is now fully represented by the three indicators.
myDF.drop(['day'], axis=1, inplace=True)
myDF.dtypes
# Count plot of each categorical/indicator column, split by the target.
# (Indentation restored: the notebook export had flattened both loops.)
for i in ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
          'contacted_before', 'previous_contact', 'campaign_3_orless',
          'campaign_3_to_6', 'campaign_morethan6', 'month', 'poutcome', 'Target']:
    plt.xticks(rotation=90)
    sns.countplot(myDF[i], hue=myDF['Target'])
    plt.show()
# Value counts (as fractions) of every category-dtype column.
for i in list(myDF.columns[myDF.dtypes == 'category']):
    print(myDF[i].value_counts(normalize=True))
    print()
#Dropping poutcome - unknown is 81% of the data i.e. most of the data is unknown
myDF.drop(['poutcome'],axis=1,inplace=True)
#If education is primary, lets assume that the unknown job is blue-collar
myDF.loc[(myDF['job']=='unknown') & (myDF['education']=='primary'),'job']='blue-collar'
# Sanity check: remaining records with implausibly high ages.
myDF.loc[(myDF['age']>90)]
# Correlation heatmap of the numeric columns.
plt.figure(figsize=(12,8))
sns.heatmap(myDF.corr(),annot=True,fmt='.2f',cmap='rainbow', )
plt.show()
# Per-class means and medians of the numeric columns.
myDF.groupby(["Target"]).mean()
myDF.groupby(["Target"]).median()
myDF.dtypes
# Row-normalized crosstabs: target rate within each level of each categorical.
pd.crosstab(myDF['job'],myDF['Target'],normalize='index')
pd.crosstab(myDF['marital'],myDF['Target'],normalize='index')
pd.crosstab(myDF['education'],myDF['Target'],normalize='index')
pd.crosstab(myDF['default'],myDF['Target'],normalize='index')
pd.crosstab(myDF['housing'],myDF['Target'],normalize='index')
pd.crosstab(myDF['loan'],myDF['Target'],normalize='index')
pd.crosstab(myDF['contact'],myDF['Target'],normalize='index')
pd.crosstab(myDF['month'],myDF['Target'],normalize='index')
# Pairwise relationships of the remaining numeric columns, colored by target.
sns.pairplot(myDF, hue="Target", palette="husl");
myDF.head()
# Month -> calendar-quarter mapping.
# NOTE(review): this name shadows the builtin `dict` and the mapping is never
# applied to myDF['month'] anywhere below — presumably intended for a quarter
# feature that was abandoned; confirm before removing.
dict = {'jan' : 'Q1','feb' : 'Q1','mar' : 'Q1', 'apr' : 'Q2','may' : 'Q2','jun' : 'Q2','jul' : 'Q3','aug' : 'Q3','sep' : 'Q3','oct' : 'Q4','nov' : 'Q4','dec' : 'Q4' }
dict
# Encode the target label as 0 (no) / 1 (yes).
dict_y={'no':0,'yes':1}
myDF.replace({"Target":dict_y},inplace=True)
myDF.head(10)
pd.crosstab(myDF['month'],myDF['Target'],normalize='index')
sns.countplot(myDF["month"])
plt.show()
#Before creating dummies, we need to drop day_w_21_31 (or one of the other 2 "day columns") since the data is implicitly in the other 2 day columns
#Same for campaign_morethan6 (implicit in the other 2 campaign columns)
myDF.drop(['day_w_21_31'],axis=1,inplace=True)
myDF.drop(['campaign_morethan6'],axis=1,inplace=True)
## Define X and Y variables
X = myDF.drop('Target', axis=1)
Y = myDF['Target']
X.head()
# One-hot encode the categoricals; drop_first avoids the dummy-variable trap.
X = pd.get_dummies(X, drop_first=True)
X.dtypes
# Correlation heatmap of the fully encoded feature matrix.
plt.figure(figsize=(30,30))
sns.heatmap(X.corr(),annot=True,fmt='.2f',cmap='rainbow', )
plt.show()
Y.head()
##Split into training and test set
from sklearn.model_selection import train_test_split
# 70/30 split with a fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30,random_state=1)
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score,accuracy_score
from sklearn.linear_model import LogisticRegression
# NOTE(review): default solver/max_iter may not converge on this data, and
# warnings are globally suppressed at the top of the file — confirm the fit converged.
logreg = LogisticRegression(random_state=1)
logreg.fit(X_train, y_train) # fit the model on train data
y_predict = logreg.predict(X_test)
## function to get confusion matrix in a proper format
def draw_cm(actual, predicted):
    """Render a labelled confusion-matrix heatmap for binary 0/1 labels.

    Parameters
    ----------
    actual : array-like
        True labels.
    predicted : array-like
        Predicted labels.

    Side effect: shows the plot. Returns None.
    """
    cm = confusion_matrix(actual, predicted)
    # annot=True prints the cell counts; tick labels are the two classes 0/1.
    sns.heatmap(cm, annot=True, fmt='.2f', xticklabels=[0, 1], yticklabels=[0, 1])
    plt.ylabel('Observed')
    plt.xlabel('Predicted')
    plt.show()
#Create Dataframe for results
# Accumulator for per-model metrics; one row is appended per fitted model.
myResults =pd.DataFrame(columns=['Algorithm','Training Accuracy','Testing Accuracy','Recall','Precision','F1 Score','ROC AUC Score'])
myResults
# Create function for appending one model's metrics to the results DataFrame
def populate_result(algorithm, training_accuracy, testing_accuracy, recall, precision, f1, roc_auc):
    """Append one row of model metrics to the module-level ``myResults``.

    ``DataFrame.append`` was deprecated in pandas 1.4 and removed in 2.0,
    so build a one-row frame and use pd.concat instead; behaviour is the
    same (row added, fresh integer index).
    """
    global myResults
    row = pd.DataFrame([{'Algorithm': algorithm,
                         'Training Accuracy': training_accuracy,
                         'Testing Accuracy': testing_accuracy,
                         'Recall': recall,
                         'Precision': precision,
                         'F1 Score': f1,
                         'ROC AUC Score': roc_auc}])
    myResults = pd.concat([myResults, row], ignore_index=True)
# Logistic-regression metrics on train/test; record them in the results table.
training_accuracy=logreg.score(X_train,y_train)
testing_accuracy=logreg.score(X_test, y_test)
recall=recall_score(y_test,y_predict)
precision=precision_score(y_test,y_predict)
f1=f1_score(y_test,y_predict)
# NOTE: ROC AUC computed from hard 0/1 predictions, not probabilities.
roc_auc=roc_auc_score(y_test,y_predict)
populate_result('Logistic Regression', training_accuracy,testing_accuracy,recall,precision,f1,roc_auc)
myResults.round(decimals=2)
print('Confusion Matrix')
# draw_cm plots and returns None, so this also prints "None" after the figure.
print(draw_cm(y_test,y_predict))
myResults.round(decimals=2)
# !pip install yellowbrick
# Additional
#AUC ROC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# AUC from hard predictions; the curve itself uses predicted probabilities.
logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal = random-classifier baseline.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
## Feature Importance or Coefficients
# Rank features by absolute logistic-regression coefficient magnitude.
fi = pd.DataFrame()
fi['Col'] = X_train.columns
fi['Coeff'] = np.round(abs(logreg.coef_[0]),2)
fi.sort_values(by='Coeff',ascending=False)
# Unpruned decision tree (gini impurity, no depth limit).
dTree = DecisionTreeClassifier(criterion = 'gini', random_state=1)
dTree.fit(X_train, y_train)
#scoring the decision tree
print(dTree.score(X_train, y_train))
print(dTree.score(X_test, y_test))
y_predict = dTree.predict(X_test)
# Record metrics for the unpruned tree.
training_accuracy=dTree.score(X_train,y_train)
testing_accuracy=dTree.score(X_test, y_test)
recall=recall_score(y_test,y_predict)
precision=precision_score(y_test,y_predict)
f1=f1_score(y_test,y_predict)
roc_auc=roc_auc_score(y_test,y_predict)
populate_result('Decision Tree', training_accuracy,testing_accuracy,recall,precision,f1,roc_auc)
print('Confusion Matrix')
print(draw_cm(y_test,y_predict))
myResults.round(decimals=4)
# Export the unpruned tree to Graphviz .dot and render it as a PNG.
train_char_label = ['No', 'Yes']
# Use a context manager so the file handle is closed even on error.
with open('Term_Deposit.dot', 'w') as Term_Deposit_File:
    dot_data = tree.export_graphviz(dTree, out_file=Term_Deposit_File,
                                    feature_names=list(X_train),
                                    class_names=list(train_char_label))
# Works only if the Graphviz "dot" command is installed on your machine.
# (Indentation of the if/else restored: the notebook export had flattened it.)
retCode = system("dot -Tpng Term_Deposit.dot -o Term_Deposit.png")
if retCode > 0:
    print("system command returning error: " + str(retCode))
else:
    # `display` is an IPython/notebook builtin.
    display(Image("Term_Deposit.png"))
# Regularized (pruned) tree: gini impurity, max depth 3.
dTreeR = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=1)
dTreeR.fit(X_train, y_train)
print(dTreeR.score(X_train, y_train))
print(dTreeR.score(X_test, y_test))
# Export the pruned tree to Graphviz .dot and render it as a PNG.
train_char_label = ['No', 'Yes']
with open('Term_DepositR.dot', 'w') as Term_Deposit_FileR:
    dot_data = tree.export_graphviz(dTreeR, out_file=Term_Deposit_FileR,
                                    feature_names=list(X_train),
                                    class_names=list(train_char_label))
# Works only if the Graphviz "dot" command is installed on your machine.
# (Indentation of the if/else restored: the notebook export had flattened it.)
retCode = system("dot -Tpng Term_DepositR.dot -o Term_DepositR.png")
if retCode > 0:
    print("system command returning error: " + str(retCode))
else:
    # `display` is an IPython/notebook builtin.
    display(Image("Term_DepositR.png"))
## Feature Importance or Coefficients
# Rank features by the pruned tree's impurity-based importances.
fi2 = pd.DataFrame()
fi2['Col'] = X_train.columns
#fi['Coeff'] = np.round(abs(logreg.coef_[0]),2)
fi2['Coeff'] = np.round(abs(dTreeR.feature_importances_),2)
fi2.sort_values(by='Coeff',ascending=False)
#print(dTreeR.score(X_test , y_test))
# Metrics for the depth-3 gini tree.
y_predict = dTreeR.predict(X_test)
training_accuracy=dTreeR.score(X_train,y_train)
testing_accuracy=dTreeR.score(X_test, y_test)
recall=recall_score(y_test,y_predict)
precision=precision_score(y_test,y_predict)
f1=f1_score(y_test,y_predict)
roc_auc=roc_auc_score(y_test,y_predict)
populate_result('Decision Tree -Gini-Depth =3', training_accuracy,testing_accuracy,recall,precision,f1,roc_auc)
print('Confusion Matrix')
print(draw_cm(y_test,y_predict))
myResults.round(decimals=4)
# Pruned tree: gini, max depth 5. NOTE: rebinds the name dTreeR, so later
# references to dTreeR (e.g. the bagging base estimator) use THIS model.
dTreeR = DecisionTreeClassifier(criterion = 'gini', max_depth = 5, random_state=1)
dTreeR.fit(X_train, y_train)
print(dTreeR.score(X_train, y_train))
print(dTreeR.score(X_test, y_test))
y_predict = dTreeR.predict(X_test)
# Metrics for the depth-5 gini tree.
training_accuracy=dTreeR.score(X_train,y_train)
testing_accuracy=dTreeR.score(X_test, y_test)
recall=recall_score(y_test,y_predict)
precision=precision_score(y_test,y_predict)
f1=f1_score(y_test,y_predict)
roc_auc=roc_auc_score(y_test,y_predict)
populate_result('Decision Tree -Gini-Depth =5', training_accuracy,testing_accuracy,recall,precision,f1,roc_auc)
print('Confusion Matrix')
print(draw_cm(y_test,y_predict))
myResults.round(decimals=4)
# Pruned tree: entropy criterion, max depth 5, for comparison with gini.
dTreeRE = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5, random_state=1)
dTreeRE.fit(X_train, y_train)
print(dTreeRE.score(X_train, y_train))
print(dTreeRE.score(X_test, y_test))
y_predict = dTreeRE.predict(X_test)
# Metrics for the depth-5 entropy tree.
training_accuracy=dTreeRE.score(X_train,y_train)
testing_accuracy=dTreeRE.score(X_test, y_test)
recall=recall_score(y_test,y_predict)
precision=precision_score(y_test,y_predict)
f1=f1_score(y_test,y_predict)
roc_auc=roc_auc_score(y_test,y_predict)
populate_result('Decision Tree -Entropy-Depth =5', training_accuracy,testing_accuracy,recall,precision,f1,roc_auc)
print('Confusion Matrix')
print(draw_cm(y_test,y_predict))
myResults.round(decimals=4)
from sklearn.ensemble import BaggingClassifier
# Bag 50 copies of the last depth-5 gini tree (dTreeR) as the base learner.
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn
# 1.2 and removed in 1.4 — update the keyword if running a modern sklearn.
bgcl = BaggingClassifier(base_estimator=dTreeR, n_estimators=50,random_state=1)
#bgcl = BaggingClassifier(n_estimators=50,random_state=1)
bgcl = bgcl.fit(X_train, y_train)
y_predict = bgcl.predict(X_test)
# Metrics for the bagged ensemble.
training_accuracy=bgcl.score(X_train,y_train)
testing_accuracy=bgcl.score(X_test, y_test)
recall=recall_score(y_test,y_predict)
precision=precision_score(y_test,y_predict)
f1=f1_score(y_test,y_predict)
roc_auc=roc_auc_score(y_test,y_predict)
populate_result('Bagging', training_accuracy,testing_accuracy,recall,precision,f1,roc_auc)
print('Confusion Matrix')
print(draw_cm(y_test,y_predict))
myResults.round(decimals=4)
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with 10 estimators (a 50-estimator variant was tried, see below).
abcl = AdaBoostClassifier(n_estimators=10, random_state=1)
#abcl = AdaBoostClassifier( n_estimators=50,random_state=1)
abcl = abcl.fit(X_train, y_train)
y_predict = abcl.predict(X_test)
# Metrics for AdaBoost.
training_accuracy=abcl.score(X_train,y_train)
testing_accuracy=abcl.score(X_test, y_test)
recall=recall_score(y_test,y_predict)
precision=precision_score(y_test,y_predict)
f1=f1_score(y_test,y_predict)
roc_auc=roc_auc_score(y_test,y_predict)
populate_result('AdaBoosting', training_accuracy,testing_accuracy,recall,precision,f1,roc_auc)
print('Confusion Matrix')
print(draw_cm(y_test,y_predict))
myResults.round(decimals=4)
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with 50 estimators.
gbcl = GradientBoostingClassifier(n_estimators = 50,random_state=1)
gbcl = gbcl.fit(X_train, y_train)
y_predict = gbcl.predict(X_test)
# Metrics for gradient boosting.
training_accuracy=gbcl.score(X_train,y_train)
testing_accuracy=gbcl.score(X_test, y_test)
recall=recall_score(y_test,y_predict)
precision=precision_score(y_test,y_predict)
f1=f1_score(y_test,y_predict)
roc_auc=roc_auc_score(y_test,y_predict)
populate_result('Gradient Boosting', training_accuracy,testing_accuracy,recall,precision,f1,roc_auc)
print('Confusion Matrix')
print(draw_cm(y_test,y_predict))
myResults.round(decimals=4)
from sklearn.ensemble import RandomForestClassifier
# Random forest: 50 trees, at most 12 features considered per split.
rfcl = RandomForestClassifier(n_estimators = 50, random_state=1,max_features=12)
rfcl = rfcl.fit(X_train, y_train)
y_predict = rfcl.predict(X_test)
# Metrics for the random forest.
training_accuracy=rfcl.score(X_train,y_train)
testing_accuracy=rfcl.score(X_test, y_test)
recall=recall_score(y_test,y_predict)
precision=precision_score(y_test,y_predict)
f1=f1_score(y_test,y_predict)
roc_auc=roc_auc_score(y_test,y_predict)
populate_result('Random Forest', training_accuracy,testing_accuracy,recall,precision,f1,roc_auc)
print('Confusion Matrix')
print(draw_cm(y_test,y_predict))
myResults.round(decimals=4)
from xgboost import XGBClassifier
# XGBoost with default hyperparameters.
bst = XGBClassifier()
bst = bst.fit(X_train, y_train)
y_predict = bst.predict(X_test)
# BUG FIX: the original scored rfcl (the random forest) here, so the row
# recorded as 'XGBoost' carried the forest's accuracies. Score the fitted
# XGBoost model instead.
training_accuracy = bst.score(X_train, y_train)
testing_accuracy = bst.score(X_test, y_test)
recall = recall_score(y_test, y_predict)
precision = precision_score(y_test, y_predict)
f1 = f1_score(y_test, y_predict)
roc_auc = roc_auc_score(y_test, y_predict)
populate_result('XGBoost', training_accuracy, testing_accuracy, recall, precision, f1, roc_auc)
print('Confusion Matrix')
print(draw_cm(y_test, y_predict))
## Feature Importance or Coefficients
# Rank features by XGBoost's feature importances.
fi3 = pd.DataFrame()
fi3['Col'] = X_train.columns
fi3['Coeff'] = np.round(abs(bst.feature_importances_),2)
fi3.sort_values(by='Coeff',ascending=False)
# Final comparison table of all recorded models.
myResults.round(decimals=4)
# All of the models other than the unpruned decision tree gave similar testing
# accuracy. However, out of all the methods, XGBoost gave the best overall
# results when Recall, Precision, F1 score and ROC AUC score are taken into account.
# I reviewed the importance of coefficients under Logistic Regression, the
# regularized Decision Tree (depth 3) and XGBoost. What they had in common was
# that we should target candidates with housing loans and those who were
# previously contacted.
# Interestingly, each of the processes above ranked several coefficients as
# important that the other algorithms deemed not as significant.